"""Build .docx from paper.md + all output tables and IRF plots."""

import re
from pathlib import Path
from docx import Document
from docx.shared import Inches, Pt, Cm, RGBColor
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.enum.table import WD_TABLE_ALIGNMENT
from docx.oxml.ns import qn

# Fixed project layout: everything is resolved relative to PROJECT.
PROJECT = Path("/mnt/c/demographics_capital_flows/capital_deepening")
TABLES_DIR = PROJECT / "output" / "tables"

# Manuscript file names in order of preference; the first one that exists on
# disk is used as the paper source.
_PAPER_CANDIDATES = (
    "capital_deepening_paper_20260221_v1.md",
    "capital_deepening_paper_20260220.md",
    "paper.md",
)
PAPER_MD = next(
    (
        candidate
        for candidate in (PROJECT / "paper" / name for name in _PAPER_CANDIDATES)
        if candidate.exists()
    ),
    PROJECT / "paper" / "paper.md",  # fallback for error msg
)
OUTPUT = PROJECT / "paper" / "capital_deepening_paper_20260305_r2.docx"

# ── helpers ──────────────────────────────────────────────────────────

def set_cell_text(cell, text, bold=False, size=Pt(9), font_name='Times New Roman'):
    """Replace a table cell's content with one centred, formatted run.

    Args:
        cell: Table cell to overwrite; its existing text is cleared first.
        text: String placed in the cell.
        bold: Whether the run is bold.
        size: Font size applied to the run.
        font_name: Font family applied to the run.
    """
    # Clearing the text leaves a single empty paragraph to write into.
    cell.text = ""
    paragraph = cell.paragraphs[0]
    paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER

    styled = paragraph.add_run(text)
    styled.bold = bold
    font = styled.font
    font.name = font_name
    font.size = size

def _split_md_row(line):
    """Split one Markdown table line into stripped cell strings.

    Only the optional border pipes at the two ends are discarded, so an
    empty cell in the middle (or at the start) of a bordered row keeps its
    column position instead of shifting the remaining cells to the left.
    """
    body = line.strip()
    if body.startswith('|'):
        body = body[1:]
    if body.endswith('|'):
        body = body[:-1]
    return [cell.strip() for cell in body.split('|')]


def add_md_table(doc, md_text, title=None):
    """Parse a markdown table and add it as a Word table.

    Args:
        doc: Document the table (and optional title paragraph) is appended to.
        md_text: Text containing the table; lines without a pipe and lines
            starting with '#' are ignored.
        title: Optional caption written as a bold paragraph above the table.

    Returns:
        None. Nothing is added when fewer than two content rows are found.
        Otherwise the first row is rendered bold as the header and an empty
        spacing paragraph is appended after the table.
    """
    lines = [l.strip() for l in md_text.strip().split('\n') if l.strip()]
    # Find table lines (contain |)
    table_lines = [l for l in lines if '|' in l and not l.startswith('#')]
    if not table_lines:
        return

    # Parse rows
    rows = []
    for line in table_lines:
        cells = _split_md_row(line)
        # Skip separator rows (|---|:--:|) and rows whose cells are all empty.
        if all(set(c) <= set('-: ') for c in cells):
            continue
        rows.append(cells)

    if len(rows) < 2:
        return

    # Add title if provided
    if title:
        p = doc.add_paragraph()
        run = p.add_run(title)
        run.bold = True
        run.font.size = Pt(11)
        run.font.name = 'Times New Roman'

    # Ragged rows are allowed: the widest row sets the column count and
    # shorter rows simply leave their trailing cells untouched.
    n_cols = max(len(r) for r in rows)
    table = doc.add_table(rows=len(rows), cols=n_cols)
    table.style = 'Light Shading'
    table.alignment = WD_TABLE_ALIGNMENT.CENTER

    for i, row_data in enumerate(rows):
        for j, cell_text in enumerate(row_data):
            set_cell_text(table.cell(i, j), cell_text, bold=(i == 0))

    doc.add_paragraph()  # spacing


def add_ascii_irf(doc, text, title=None):
    """Append an ASCII impulse-response plot to the document.

    Args:
        doc: Document receiving the paragraphs.
        text: Plot text, written as a single 7pt Courier New run.
        title: Optional caption written as a bold 10pt paragraph above the plot.
    """
    if title:
        caption = doc.add_paragraph().add_run(title)
        caption.bold = True
        caption.font.size = Pt(10)
        caption.font.name = 'Times New Roman'

    plot_paragraph = doc.add_paragraph()
    spacing = plot_paragraph.paragraph_format
    spacing.space_before = Pt(2)
    spacing.space_after = Pt(2)

    mono = plot_paragraph.add_run(text)
    mono.font.size = Pt(7)
    mono.font.name = 'Courier New'


def parse_md_file(filepath):
    """Parse a .md table file into sections of tables, notes and code blocks.

    Args:
        filepath: Path-like object exposing ``read_text``; the file is decoded
            as UTF-8 and a leading byte-order mark, if any, is discarded.

    Returns:
        A list of tuples in document order:

        * ``('irf', content)`` for every fenced code block. The fence line
          (including a one-word info string such as "text") is dropped,
          leading blank lines and trailing whitespace are removed, and the
          indentation of the first plot line is kept.
        * ``('table', title, md)`` for every run of lines containing a pipe.
          ``title`` is the text of the closest preceding heading in the same
          part of the file that has not already been attached to a table,
          or None.
        * ``('note', text)`` for every line starting with '*'. A note that
          follows a table is emitted after that table.

    Blank lines do not end a table; a heading, a note or any other non-blank
    line without a pipe does.
    """
    text = filepath.read_text(encoding='utf-8-sig')
    sections = []

    # Split into code blocks and non-code blocks
    for part in re.split(r'(```[\s\S]*?```)', text):
        if part.startswith('```'):
            # Code block - ASCII IRF
            inner = part.strip('`')
            first_line, newline, rest = inner.partition('\n')
            # The remainder of the opening fence line is either empty or a
            # one-word info string; neither belongs to the plot.
            if newline and re.fullmatch(r'[\w.+#-]*', first_line.strip()):
                inner = rest
            # Drop leading blank lines but keep the first plot line's indent,
            # which matters for the alignment of ASCII art.
            content = re.sub(r'\A(?:[ \t]*\n)+', '', inner).rstrip()
            sections.append(('irf', content))
            continue

        # Check for tables, headers and notes
        current_title = None
        table_buf = []

        for line in part.split('\n'):
            is_heading = line.startswith('#')
            if not is_heading and '|' in line:
                table_buf.append(line)
                continue
            if not is_heading and not line.strip():
                continue

            # Any other non-blank line ends the pending table, so that what
            # follows (e.g. a note) is recorded after it rather than lost.
            if table_buf:
                sections.append(('table', current_title, '\n'.join(table_buf)))
                table_buf = []
                current_title = None

            if is_heading:
                current_title = re.sub(r'^#+\s*', '', line).strip()
            elif line.strip().startswith('*'):
                sections.append(('note', line.strip()))

        if table_buf:
            sections.append(('table', current_title, '\n'.join(table_buf)))

    return sections


# ── main ──────────────────────────────────────────────────────────────

# LaTeX fragments replaced literally. Each ends in a delimiter or is a
# complete accent construct, so they are handled before command lookup.
_LATEX_LITERALS = (
    ('\\hat{\\beta}', 'β\u0302'),
    ('\\hat{\\gamma}', 'γ\u0302'),
    ('\\hat\\gamma', 'γ\u0302'),
    ('\\text{', ''),
    ('\\left(', '('), ('\\right)', ')'),
    ('\\left[', '['), ('\\right]', ']'),
    ('\\{', '{'), ('\\}', '}'),
    ('\\_', '_'),
)

# Plain-text stand-ins for LaTeX commands, keyed by the command name without
# its backslash. Commands not listed here are left untouched.
_LATEX_SYMBOLS = {
    'widehat': '', 'hat': '',
    'log': 'log', 'exp': 'exp', 'Pr': 'Pr',
    'cdots': '...', 'cdot': '·', 'times': '×', 'div': '÷',
    'sum': 'Σ', 'prime': '′',
    'leq': '≤', 'geq': '≥', 'approx': '≈', 'neq': '≠', 'sim': '~', 'in': '∈',
    'varepsilon': 'ε', 'alpha': 'α', 'beta': 'β', 'gamma': 'γ',
    'delta': 'δ', 'Delta': 'Δ', 'Lambda': 'Λ', 'phi': 'φ',
    'tau': 'τ', 'mu': 'μ', 'sigma': 'σ',
}

# A command name is the longest run of letters after the backslash, so a
# short name can never match the front of a longer one.
_LATEX_COMMAND_RE = re.compile(r'\\([A-Za-z]+)')

# Inline tokens: a $...$ math span or a run of asterisks.
_INLINE_TOKEN_RE = re.compile(r'\$[^$]+\$|\*+')

_LEADING_BLANK_LINES_RE = re.compile(r'\A(?:[ \t]*\n)+')


def _latex_to_text(expr):
    """Convert a LaTeX math expression to a plain Unicode approximation.

    Used for both display and inline math so the two render consistently.
    Whole command names are matched, which keeps the longer of two commands
    sharing a prefix (cdot/cdots, in/infty) from being clipped.
    """
    for old, new in _LATEX_LITERALS:
        expr = expr.replace(old, new)
    expr = _LATEX_COMMAND_RE.sub(
        lambda m: _LATEX_SYMBOLS.get(m.group(1), m.group(0)), expr)
    # Handle subscripts/superscripts before stripping braces
    expr = re.sub(r'_\{([^}]+)\}', r'_\1', expr)
    expr = re.sub(r'\^\{([^}]+)\}', r'^\1', expr)
    # Clean remaining braces
    return expr.replace('{', '').replace('}', '')


def _inline_segments(text):
    """Split one line of Markdown into formatted segments.

    Returns a list of ``(text, bold, italic, is_math)`` tuples in reading
    order. Runs of one, two or three asterisks are paired as emphasis only
    when an opener (followed by a non-space character and not preceded by a
    letter or digit) has a later closer of the same length that is preceded
    by a non-space character. Every other asterisk stays literal, so
    significance stars such as "8.9***" survive while emphasis ending in a
    digit ("**Table 1**") is still recognised. ``$...$`` spans are returned
    as math segments with the dollar signs removed.
    """
    tokens = list(_INLINE_TOKEN_RE.finditer(text))

    # Pass 1: decide which star runs open (+1) or close (-1) emphasis.
    roles = {}
    open_runs = []  # (token index, run length) of openers awaiting a closer
    for idx, tok in enumerate(tokens):
        run = tok.group()
        if run[0] != '*' or len(run) > 3:
            continue
        before = text[tok.start() - 1] if tok.start() > 0 else ''
        after = text[tok.end()] if tok.end() < len(text) else ''
        can_close = before != '' and not before.isspace()
        can_open = after != '' and not after.isspace() and not before.isalnum()

        partner = None
        if can_close:
            for pos in range(len(open_runs) - 1, -1, -1):
                if open_runs[pos][1] == len(run):
                    partner = pos
                    break
        if partner is not None:
            roles[open_runs[partner][0]] = 1
            roles[idx] = -1
            # Openers nested inside the pair that never closed stay literal.
            del open_runs[partner:]
        elif can_open:
            open_runs.append((idx, len(run)))

    # Pass 2: emit text between tokens with the emphasis state in force.
    segments = []
    pending = ''
    bold_depth = 0
    italic_depth = 0
    cursor = 0
    for idx, tok in enumerate(tokens):
        pending += text[cursor:tok.start()]
        cursor = tok.end()
        run = tok.group()
        step = roles.get(idx, 0)
        if run[0] == '*' and step == 0:
            pending += run  # unpaired stars are ordinary text
            continue
        if pending:
            segments.append((pending, bold_depth > 0, italic_depth > 0, False))
            pending = ''
        if run[0] == '$':
            segments.append((run[1:-1], False, False, True))
        else:
            if len(run) != 2:  # '*' and '***' carry italic
                italic_depth += step
            if len(run) != 1:  # '**' and '***' carry bold
                bold_depth += step
    pending += text[cursor:]
    if pending:
        segments.append((pending, bold_depth > 0, italic_depth > 0, False))
    return segments


def _add_black_heading(doc, text, level, size, centered=False):
    """Add a heading rendered in black Times New Roman at the given size."""
    heading = doc.add_heading(text, level=level)
    if centered:
        heading.alignment = WD_ALIGN_PARAGRAPH.CENTER
    for run in heading.runs:
        run.font.size = size
        run.font.name = 'Times New Roman'
        run.font.color.rgb = RGBColor(0, 0, 0)
    return heading


def build_docx():
    """Render the manuscript and all appendix tables into OUTPUT.

    The manuscript at PAPER_MD is read line by line: '#', '##' and '###'
    headings become Word headings, '$$' blocks become centred plain-text
    equations, pipe tables become Word tables, and every other non-blank
    line becomes its own paragraph with inline bold/italic/math applied.
    Paragraphs between the title and the first '##' heading are centred.

    After a page break, each table file listed below that exists in
    TABLES_DIR is added on its own page; missing files are reported on
    stdout and skipped. The document is then saved to OUTPUT and its path
    and size are printed.

    Raises:
        FileNotFoundError: If PAPER_MD does not exist.
    """
    doc = Document()

    # Page setup
    for section in doc.sections:
        section.top_margin = Cm(2.54)
        section.bottom_margin = Cm(2.54)
        section.left_margin = Cm(2.54)
        section.right_margin = Cm(2.54)

    # Default font
    style = doc.styles['Normal']
    style.font.name = 'Times New Roman'
    style.font.size = Pt(12)
    style.paragraph_format.space_after = Pt(6)
    style.paragraph_format.line_spacing = 1.15

    # ── Parse paper.md and render ──
    # Decoded explicitly so the result does not depend on the platform
    # default; 'utf-8-sig' also drops a BOM that would otherwise hide a
    # heading on the first line. Assumes the manuscript is UTF-8 — TODO confirm.
    paper_text = PAPER_MD.read_text(encoding='utf-8-sig')
    lines = paper_text.split('\n')

    # Track whether we're in the title-page metadata block (between title and first ##)
    in_title_block = False

    i = 0
    while i < len(lines):
        line = lines[i]

        # Title (# heading)
        if line.startswith('# '):
            _add_black_heading(doc, line[2:].strip(), level=0, size=Pt(16),
                               centered=True)
            in_title_block = True
            i += 1
            continue

        # Section heading (##); the first one also ends the title block
        if line.startswith('## '):
            in_title_block = False
            _add_black_heading(doc, line[3:].strip(), level=1, size=Pt(14))
            i += 1
            continue

        # Subsection heading (###)
        if line.startswith('### '):
            _add_black_heading(doc, line[4:].strip(), level=2, size=Pt(12))
            i += 1
            continue

        # Math blocks ($$)
        if line.strip().startswith('$$'):
            stripped = line.strip()
            # Single-line $$...$$: equation is self-contained
            if stripped.endswith('$$') and len(stripped) > 4:
                math_lines = [stripped[2:-2]]
                i += 1
            else:
                # Multi-line: opening $$ on this line, closing $$ on a later line
                math_lines = [stripped.replace('$$', '')]
                i += 1
                while i < len(lines) and '$$' not in lines[i]:
                    math_lines.append(lines[i].strip())
                    i += 1
                if i < len(lines):
                    math_lines.append(lines[i].strip().replace('$$', ''))
                    i += 1
            math_text = _latex_to_text(' '.join(l for l in math_lines if l))
            p = doc.add_paragraph()
            p.alignment = WD_ALIGN_PARAGRAPH.CENTER
            run = p.add_run(math_text)
            run.font.size = Pt(10)
            run.font.name = 'Cambria Math'
            run.italic = True
            continue

        # Markdown table in paper body
        if '|' in line and line.strip().startswith('|'):
            table_lines = []
            while i < len(lines) and '|' in lines[i]:
                table_lines.append(lines[i])
                i += 1
            add_md_table(doc, '\n'.join(table_lines))
            continue

        # Regular paragraph
        if line.strip():
            p = doc.add_paragraph()
            if in_title_block:
                p.alignment = WD_ALIGN_PARAGRAPH.CENTER
            for chunk, bold, italic, is_math in _inline_segments(line.strip()):
                if is_math:
                    run = p.add_run(_latex_to_text(chunk))
                    run.italic = True
                    run.font.name = 'Cambria Math'
                    run.font.size = Pt(11)
                else:
                    run = p.add_run(chunk)
                    if bold:
                        run.bold = True
                    if italic:
                        run.italic = True
                    run.font.name = 'Times New Roman'
                    run.font.size = Pt(12)

        i += 1

    # ── Page break before appendix tables ──
    doc.add_page_break()
    _add_black_heading(doc, 'Appendix: Tables and Impulse Response Functions',
                       level=0, size=Pt(16), centered=True)

    # ── Add all tables ──
    table_files = [
        ("Table 1: Summary Statistics", "summary_statistics.md"),
        ("Table 2: Allocation Puzzle", "allocation_puzzle.md"),
        ("Table 3: Demographics and Capital Deepening", "capital_deepening.md"),
        ("Table 4: Flow-Outcome IV Regressions", "flow_outcomes.md"),
        ("Table 5: Absorptive Capacity", "absorptive_capacity.md"),
        ("Table 6: OLS vs. IV Comparison", "ols_iv_comparison.md"),
        ("Table 7: Robustness Checks", "robustness.md"),
        ("Table 8: Local Projections — Dynamic Effects", "local_projections.md"),
        ("Table 9: J-Curve Robustness Tests", "jcurve_robustness.md"),
        ("Table 10: First-Differenced Local Projections", "firstdiff_lp.md"),
        ("Table 11: Augmented Local Projections — Pre-Trend Fix", "pretrend_fix.md"),
        ("Table 12: Pre-Trend Diagnosis", "pretrend_diagnosis.md"),
        ("Table 13: K/L Decomposition (Reviewer Response)", "phase12_kl_decomposition.md"),
        ("Table 14: Reshuffled-ΔZ Permutation Placebo", "phase12_shuffled_placebo.md"),
        ("Table 15: Exclusion Restriction Tests", "phase12_exclusion_tests.md"),
        ("Table 16: Absorptive Capacity Stratification", "phase12_absorptive_stratification.md"),
        ("Table 17: LP Implementation Details", "phase12_lp_specification.md"),
    ]

    for label, filename in table_files:
        filepath = TABLES_DIR / filename
        if not filepath.exists():
            # Still best-effort, but say so: a skipped file leaves a gap in
            # the appendix numbering that is easy to miss otherwise.
            print(f"Skipping missing table file: {filepath}")
            continue

        doc.add_page_break()

        # Table title
        _add_black_heading(doc, label, level=1, size=Pt(13))

        # Parse file
        for entry in parse_md_file(filepath):
            kind = entry[0]
            if kind == 'table':
                _, title, md = entry
                if title and title != label:
                    add_md_table(doc, md, title=title)
                else:
                    add_md_table(doc, md)
            elif kind == 'irf':
                # Split into individual IRF blocks:
                # split on double newline before title lines (===)
                for block in re.split(r'\n\n+(?=[A-Z\(])', entry[1]):
                    # Trim blank lines and trailing whitespace only, so the
                    # first plot line keeps its indentation.
                    block = _LEADING_BLANK_LINES_RE.sub('', block).rstrip()
                    if not block.strip():
                        continue
                    # Extract title (first line with === or --- under it)
                    block_lines = block.split('\n')
                    title = None
                    content_start = 0
                    if len(block_lines) > 1 and (
                            '===' in block_lines[1] or '---' in block_lines[1]):
                        title = block_lines[0].strip()
                        content_start = 2

                    content = '\n'.join(block_lines[content_start:])
                    add_ascii_irf(doc, content, title=title)
            elif kind == 'note':
                p = doc.add_paragraph()
                run = p.add_run(entry[1])
                run.italic = True
                run.font.size = Pt(9)
                run.font.name = 'Times New Roman'

    # Save
    doc.save(str(OUTPUT))
    print(f"Saved: {OUTPUT}")
    print(f"Size: {OUTPUT.stat().st_size / 1024:.0f} KB")


# Script entry point: build the document only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    build_docx()
